This step loads the required libraries and the dataset:
#Required for plotting: force the Cairo bitmap backend so PNG devices work
#on headless systems (no X11 display)
options(bitmapType = "cairo")
#Load libraries
library(dplyr, quietly=TRUE, verbose=FALSE, warn.conflicts=FALSE)
library(Seurat, quietly=TRUE, verbose=FALSE, warn.conflicts=FALSE)
##
## Attaching package: 'SeuratObject'
## The following object is masked from 'package:base':
##
## intersect
library(patchwork, quietly=TRUE, verbose=FALSE, warn.conflicts=FALSE)
# Load dataset: read the 10x Genomics matrices and wrap them in a Seurat object
pbmc <- CreateSeuratObject(
#Data directory holding the 10x filtered matrices (hg19 reference)
counts = Read10X(data.dir = "./data/filtered_matrices_mex/hg19/"),
#Dataset name
project = "pbmc68k",
#Omit features detected in <5 cells
min.cells = 5,
#Omit cells with <200 features
min.features = 200
)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
This step carries out the 4-stage preprocessing of the dataset: QC, normalization, feature selection, centering and scaling.
First, QC metrics are visualized:
#Get QC metrics: per-cell percentage of counts from mitochondrial genes
#(human mitochondrial gene symbols start with "MT-")
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
#Plot QC metrics: features per cell, counts per cell, mitochondrial percentage
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
#Pairwise scatter plots of the QC metrics, to guide the filtering thresholds below
FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt") +
FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
Based on them, inferior-quality data is filtered out of the dataset:
pbmc <- subset(pbmc,
#Filter out data from cells which:
#1. have at most 300 or at least 2000 features, or
#2. have at least 4% mitochondrial counts
subset = nFeature_RNA > 300 & nFeature_RNA < 2000 & percent.mt < 4)
#Normalize the counts (Seurat's default log-normalization)
pbmc <- NormalizeData(pbmc)
## Normalizing layer: counts
#Select the 2000 most variable features via the variance-stabilizing transformation
pbmc <- FindVariableFeatures(pbmc, selection.method = 'vst', nfeatures = 2000)
## Finding variable features for layer counts
#Plot the relevant features, labeling the 10 most variable genes
varfts_plot <- VariableFeaturePlot(pbmc)
varfts_plot + LabelPoints(plot = varfts_plot, points = head(VariableFeatures(pbmc), 10), repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
Due to ceaseless OOM errors in this step, I had to restrict the
scaled features to only the variable features (by omitting the
features argument passed to
Seurat::ScaleData()). This shouldn’t affect the PCA and
clustering results, but the heatmaps (see below) will likely be off.
I’ve included them nonetheless.
#Center and scale the data (variable features only, per the note above)
pbmc <- ScaleData(pbmc,
#Small block size helps prevent OOM errors (at the cost of speed)
block.size=1)
## Centering and scaling data matrix
In this step, PCA of the pre-processed data is carried out.
First, carry out the PCA proper:
#Carry out the PCA on the 2000 variable features selected earlier
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
## PC_ 1
## Positive: CST3, SPI1, SERPINA1, LST1, FCN1, LYZ, CFD, RP11-290F20.3, CD68, IFI30
## MS4A7, PILRA, HCK, AIF1, TMEM176B, TYMP, FCER1G, LRRC25, CFP, HLA-DRB1
## HLA-DRA, TYROBP, HLA-DRB5, HLA-DPA1, S100A9, IGSF6, HMOX1, LILRB2, HLA-DPB1, C19orf38
## Negative: RPS6, RPL13, RPS18, RPS2, LTB, IL32, RPL34, CD7, IL7R, CCR7
## JUN, CD8B, AQP3, RGCC, RPLP1, EIF3E, PASK, RP11-291B21.2, CTSW, NGFRAP1
## CD8A, GZMK, MYC, CCL5, FKBP11, RP11-664D1.1, HIST1H1D, CRIP2, DUSP2, JUNB
## PC_ 2
## Positive: LTB, RPL13, RPS2, CD79A, RPS18, RPS6, RPL34, HLA-DRA, RPLP1, TCL1A
## CD79B, MS4A1, HLA-DQA1, LINC00926, JUNB, VPREB3, FCER2, HLA-DQA2, HLA-DMA, LY86
## HLA-DMB, HLA-DOB, SPIB, BANK1, HVCN1, CCR7, FCRLA, HLA-DQB1, EAF2, HLA-DPA1
## Negative: NKG7, GNLY, CST7, GZMB, GZMA, FGFBP2, CCL5, GZMH, PRF1, CTSW
## HOPX, CLIC3, SPON2, FCGR3A, CCL4, TYROBP, SRGN, MATK, PRSS23, CD63
## S1PR5, PFN1, KLRD1, IGFBP7, GPR56, TMSB4X, S100A4, FCRL6, C1orf21, IFITM2
## PC_ 3
## Positive: CD79A, MS4A1, TCL1A, CD79B, HLA-DRA, HLA-DQA1, HLA-DPB1, HLA-DRB1, CD74, HLA-DPA1
## HLA-DQA2, HLA-DRB5, LINC00926, VPREB3, FCER2, SPIB, HLA-DMB, HLA-DQB1, HLA-DMA, HLA-DOB
## BANK1, PDLIM1, FCRLA, HVCN1, IRF8, BLK, GZMB, LY86, CD22, PNOC
## Negative: JUNB, RP11-290F20.3, SERPINA1, AIF1, CFD, RPS6, MS4A7, TMEM176B, PILRA, IL7R
## RPL13, FCN1, RPL34, RPS2, LST1, HES4, LILRB2, CD68, CDKN1C, IFI30
## LRRC25, RPS18, HMOX1, GPBAR1, LTB, C1QA, VMO1, CSF1R, CFP, MAFB
## PC_ 4
## Positive: RPS2, RPL13, RPS6, FCGR3A, RPS18, RP11-290F20.3, MS4A7, CD79B, CD79A, TCL1A
## NKG7, SERPINA1, MS4A1, GNLY, HES4, HMOX1, LINC00926, PILRA, CFD, CDKN1C
## VMO1, FCER2, VPREB3, TESC, FGFBP2, HVCN1, CST7, IFITM3, LILRA3, LRRC25
## Negative: PPBP, CLU, GNG11, PF4, SDPR, NRGN, PTCRA, HIST1H2AC, TUBB1, GP9
## ACRBP, SPARC, CMTM5, TREML1, NCOA4, RUFY1, ITGA2B, SNCA, RGS18, MPP1
## AP001189.4, MYL9, CD9, CLDN5, LMNA, AC147651.3, RAB32, GPX1, GSN, MAP3K7CL
## PC_ 5
## Positive: MS4A7, RP11-290F20.3, CD79B, VMO1, CD79A, FCGR3A, CDKN1C, HMOX1, HES4, CD68
## HIST1H2AC, CKB, TCL1A, MS4A1, ICAM4, LYN, PPBP, GNG11, LILRA3, SDPR
## SERPINA1, PF4, SIGLEC10, C5AR1, PILRA, CLU, TESC, MPP1, LYL1, LINC00926
## Negative: FCER1A, LGALS2, CLEC10A, LYZ, ALDH2, S100A8, CPVL, CD1C, ENHO, MS4A6A
## S100A9, GRN, CST3, CD14, RNASE6, RPS2, CAPG, AMICA1, IGSF6, S100A4
## BLVRB, CD33, LGALS1, HLA-DMA, CACNA2D3, RPLP1, SERPINF1, CD1D, TMSB10, FCGRT
#Plot PCA results
#Gene loadings for the first two PCs
VizDimLoadings(pbmc, dims = 1:2, reduction = 'pca')
#Cells projected onto PC1/PC2
DimPlot(pbmc, reduction = 'pca')
#Heatmap of PC1 (500 cells, balanced positive/negative loadings)
DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)
#Heatmaps of the first 15 PCs, to help pick the dimensionality cutoff
DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)
Next, find which of the principal components are actually significant:
#Carry out JackStraw resampling (100 replicates) to assess PC significance
pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
#Plot JackStraw results for the first 20 PCs
JackStrawPlot(pbmc, dims = 1:20)
## Warning: Removed 28718 rows containing missing values or values outside the scale range
## (`geom_point()`).
#Also produce an elbow plot (standard deviation explained per PC)
ElbowPlot(pbmc)
Based on this, I assumed a cutoff between the 10th and 11th PCs.
This step carries out the clustering and 2D embedding (UMAP) of the dataset.
Firstly, clustering:
#Build the (shared) nearest-neighbor graph on the first 10 PCs (per the cutoff above)
pbmc <- FindNeighbors(pbmc, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
#Cluster the SNN graph
pbmc <- FindClusters(pbmc,
#Value picked to net 10 clusters
resolution = 0.15)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 65251
## Number of edges: 1769153
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9518
## Number of communities: 10
## Elapsed time: 58 seconds
Secondly, the 2D embedding (UMAP, used here in place of tSNE):
pbmc <- RunUMAP(pbmc, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 23:01:42 UMAP embedding parameters a = 0.9922 b = 1.112
## 23:01:42 Read 65251 rows and found 10 numeric columns
## 23:01:42 Using Annoy for neighbor search, n_neighbors = 30
## 23:01:42 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 23:01:58 Writing NN index file to temp file /tmp/RtmpnrYO0C/file64da12d4fc56
## 23:01:58 Searching Annoy index using 1 thread, search_k = 3000
## 23:03:13 Annoy recall = 100%
## 23:03:15 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 23:03:19 Initializing from normalized Laplacian + noise (using RSpectra)
## 23:03:31 Commencing optimization for 200 epochs, with 2786446 positive edges
## 23:04:36 Optimization finished
Lastly, the results of the UMAP embedding are visualized in the style of Zheng et al. 2017.
#Retrieve the number of cells in each cluster
ccc <- table(Idents(pbmc))
#Transform absolute counts into percentages
ccc <- ccc / sum(ccc) * 100
#Append cluster IDs to percentages; levels() of a Seurat object returns the
#active identity levels, in the same order as the table above
ccc <- sprintf("%s: %.2f%%", levels(pbmc), ccc)
#Apply these percentages as cluster names (named vector: old level -> new label)
names(ccc) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, ccc)
#Plot the labeled UMAP embedding
DimPlot(pbmc, reduction = 'umap', label=TRUE) + NoLegend()
#Restore nondescript cluster names by inverting the renaming map
oldcls <- names(ccc)
names(oldcls) <- as.character(ccc)
pbmc <- RenameIdents(pbmc, oldcls)
Here, each of the 10 clusters is independently re-clustered.
The following helper function will be used to automatically re-cluster and plot each cluster:
#Re-cluster one cluster of the full dataset and plot the labeled result.
#Arguments:
# cluster: Name of the cluster to re-cluster
# resolution: Passed to Seurat::FindClusters(); can be changed to alter the number of created sub-clusters
# object: Seurat object to take the cluster from (defaults to the global pbmc)
# dims: Principal components used for the neighbor graph and the UMAP embedding
#Returns: a ggplot object -- the labeled UMAP plot of the sub-clusters
recluster <- function(cluster, resolution, object = pbmc, dims = 1:10) {
#Retrieve the requested cluster
cl <- subset(x = object, idents = cluster)
#Carry out sub-clustering and UMAP (note: UMAP, not tSNE, is used here)
cl <- FindNeighbors(cl, dims = dims, verbose=FALSE)
cl <- FindClusters(cl, resolution=resolution, verbose=FALSE)
cl <- RunUMAP(cl, dims = dims, verbose=FALSE)
#Format sub-cluster names as "<parent>/<sub-cluster>: <percentage>%"
ccc <- table(Idents(cl))
ccc <- ccc / sum(ccc) * 100
ccc <- sprintf("%s/%s: %.2f%%", cluster, levels(cl), ccc)
names(ccc) <- levels(cl)
cl <- RenameIdents(cl, ccc)
#Plot the labeled sub-cluster UMAP
return(DimPlot(cl, reduction = 'umap', label=TRUE) + NoLegend())
}
Now, each cluster will be re-clustered proper.
Cluster no. 0, for 2 sub-clusters:
recluster("0", 0.15)
Cluster no. 1, for 2 sub-clusters:
recluster("1", 0.2)
Cluster no. 2, for 3 sub-clusters:
recluster("2", 0.2)
Cluster no. 3, for 1 sub-cluster, so no actual further clustering:
recluster("3", 0.1)
Cluster no. 4, for 2 sub-clusters:
recluster("4", 0.1)
Cluster no. 5, for 3 sub-clusters:
recluster("5", 0.1)
Cluster no. 6, for 4 sub-clusters:
recluster("6", 0.2)
Cluster no. 7, for 2 sub-clusters:
recluster("7", 0.25)
Cluster no. 8, for 2 sub-clusters:
recluster("8", 0.1)
Cluster no. 9, for 2 sub-clusters:
recluster("9", 0.5)